Introduction

In a prior post (LINK HERE), I began this project: a bit of a personal workshop in learning how to use Stan for spatial modeling. In that post, I did some exploratory data analysis to set up the overarching idea of the project, which is to (a) follow the example in LINK PAPER and use an ICAR model to model spatial dependencies in crashes across NYC and (b) potentially add a treatment/difference in difference component to this model to understand whether the congestion pricing policy has an effect on the number of vehicle crashes. In the interest of breaking this project into digestible chunks, I have decided that this, part 2 of 3, will focus only on the spatial modeling part. To that end, I’ll be basically following this GUIDE from Mitzi Morris.

# Loading NYC Census Tracts: https://data.cityofnewyork.us/City-Government/2020-Census-Tracts/63ge-mke6/about_data 
nyc_tracts <- read.csv("data/2020_Census_Tracts_20250606.csv", stringsAsFactors = FALSE) %>%
  rename("geometry" = "the_geom")
nyc_tracts <- st_as_sf(nyc_tracts, wkt = "geometry", crs = 4326)
nyc_tracts <- nyc_tracts %>%
  mutate(
    area_sqm = Shape_Area / 27878400
  ) %>%
  select(
    geometry,
    BoroCT2020,
    BoroCode,
    BoroName,
    area_sqm,
    GEOID
  )

# Loading collisions data: https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data 
filter <- c(0, NA)

crashes <- read.csv("data/Motor_Vehicle_Collisions_-_Crashes_20250605.csv") %>%
  select(CRASH.DATE,
         LATITUDE,
         LONGITUDE,
         NUMBER.OF.PERSONS.INJURED,
         NUMBER.OF.PERSONS.KILLED,
         NUMBER.OF.PEDESTRIANS.INJURED,
         NUMBER.OF.PEDESTRIANS.KILLED) %>%
  filter(! LATITUDE %in% filter,
         ! LONGITUDE %in% filter) %>%
  rename(
    "date" = "CRASH.DATE",
    "lat" = "LATITUDE",
    "long" = "LONGITUDE",
    "persons_inj" = "NUMBER.OF.PERSONS.INJURED",
    "persons_death" = "NUMBER.OF.PERSONS.KILLED",
    "ped_inj" = "NUMBER.OF.PEDESTRIANS.INJURED",
    "ped_death" = "NUMBER.OF.PEDESTRIANS.KILLED"
  ) %>%
  mutate(
    date = mdy(date)
    ) %>%
  st_as_sf(coords = c("long","lat"), crs = 4326)

# Grabbing some census variables via Tidycensus, using Keyring for API Key privacy
tidycensus_api_key <- key_get(service = "tidycensus_API", username = "my_tidycensus")
census_api_key(tidycensus_api_key)

census_vars <- get_acs(state = "NY",
                       county = c("Bronx", "Kings", "New York", "Queens", "Richmond"),
                       geography = "tract",
                       variables = c(medincome = "B19013_001",
                                     population = "B01003_001",
                                     median_age = "B01002_001",
                                     transport_baseline = "B08301_001",
                                     transport_pubtransit = "B08301_010"),
                       geometry = FALSE,
                       keep_geo_vars = FALSE,
                       year = 2022,
                       output = "wide"
                       ) %>%
  mutate(
    GEOID = as.numeric(GEOID),
    median_income = medincomeE,
    population = populationE,
    median_age = median_ageE,
    prop_pubtransit = transport_pubtransitE / transport_baselineE
  ) %>%
  select(
    GEOID,
    median_income,
    population,
    median_age,
    prop_pubtransit
  )

# Associating CP Zone, Pre/Post, Treatment, Borough, and Census Tract w/ Observations
crashes <- crashes %>%
  st_join(nyc_tracts, join = st_within) %>%
  filter(! is.na(area_sqm))

# Aggregating/summarizing data to census tract level, no time component
crashes_tract <- crashes %>%
  group_by(BoroCT2020) %>%
  summarize(tot_crashes = n(),
            area = mean(area_sqm)) %>%
  ungroup() %>%
  st_drop_geometry()

# Getting Fragmentation Index from: https://github.com/mitzimorris/geomed_2024/blob/main/data/nyc_study.geojson

frag_data = st_read(file.path("data", "nyc_study.geojson"), quiet = TRUE) %>%
  st_drop_geometry() %>%
  select(BoroCT2010, frag_index, traffic) %>%
  mutate(
    BoroCT2010 = as.numeric(BoroCT2010)
  )

# Joining everything together, selecting only variables that I want
crashes_tracts_geo <- nyc_tracts %>%
  right_join(crashes_tract, 
             by = "BoroCT2020") %>%
  left_join(census_vars,
            by = "GEOID") %>%
  left_join(frag_data,
            by = c("BoroCT2020" = "BoroCT2010")) %>%
  mutate(
    pop_per_sqm = population / area_sqm
  ) %>%
  select(! c(area, population))

# Interactive Data Table for Display
crashes_dt <- crashes_tracts_geo %>%
  st_drop_geometry() %>%
  mutate(
    area_sqm = round(area_sqm, 3),
    prop_pubtransit = round(prop_pubtransit, 3),
    pop_per_sqm = round(pop_per_sqm, 3),
  )

datatable(crashes_dt,
          extensions = 'Buttons',
          filter = "top",
          rownames = FALSE,
          options = list(
            autoWidth = TRUE,
            scrollX = TRUE
          ),
  class = 'compact',
  escape = FALSE
) %>%
  formatStyle(
    columns = names(crashes_dt),
    `white-space` = "nowrap",
    `height` = "1.5em",
    `line-height` = "1.5em"
  )

More Plots

Histogram of Collisions per Square Mile

ggplot(data = crashes_tracts_geo, aes(x = tot_crashes)) + 
  geom_histogram(aes(y = after_stat(density)), 
                 bins = 500, 
                 color = "lightblue",
                 fill = "lightblue",
                 alpha = 0.75) + 
  geom_density(color = "#FFC600", linewidth = 1) + 
  theme_bw() + 
  labs(title = "Distribution Total Crashes per Census Tract (2024-25)",
       x = "Number of Crashes",
       y = "Density")

Choropleth Maps

Map of Collisions

crash_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "tot_crashes",
              fill.scale = tm_scale_intervals(values = "brewer.yl_or_rd",
                                              style = "jenks"),
              fill.legend = tm_legend(title = "Total Crashes, 2024-25"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

crash_map

Predictor Maps

Median Income // Median Age

income_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "median_income",
              fill.scale = tm_scale_intervals(values = "brewer.greens",
                                              style = "jenks"),
              fill.legend = tm_legend(title = "Median Income"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

medianage_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "median_age",
              fill.scale = tm_scale_intervals(values = "brewer.blues",
                                              style = "jenks"),
              fill.legend = tm_legend(title = "Median Age"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

tmap_arrange(income_map,
             medianage_map,
             nrow = 1, ncol = 2)

Proportion Using Public Transit to Commute // Population per Square Mile

prop_pubtransit_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "prop_pubtransit",
              fill.scale = tm_scale_intervals(values = "brewer.purples",
                                              style = "jenks",
                                              value.na = "grey"),
              fill.legend = tm_legend(title = "Proportion that Commute Using Public Transit"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

pop_per_sqm_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "pop_per_sqm",
              fill.scale = tm_scale_intervals(values = "brewer.yl_gn",
                                              style = "jenks"),
              fill.legend = tm_legend(title = "Population per Square Mile"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

tmap_arrange(prop_pubtransit_map,
             pop_per_sqm_map,
             nrow = 1, ncol = 2)

Fragmentation Index // Traffic

frag_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "frag_index",
              fill.scale = tm_scale_intervals(values = "brewer.pu_or",
                                              style = "quantile",
                                              midpoint = 0,
                                              value.na = "grey"),
              fill.legend = tm_legend(title = "Fragmentation Index"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

traffic_map <- tm_shape(crashes_tracts_geo) + 
  tm_polygons(fill = "traffic",
              fill.scale = tm_scale_intervals(values = "brewer.oranges",
                                              style = "quantile",
                                              midpoint = 0,
                                              value.na = "grey"),
              fill.legend = tm_legend(title = "Traffic"),
              lwd = 0.3) +
  tm_layout(legend.outside = TRUE, 
            legend.outside.position = "right")

tmap_arrange(frag_map,
             traffic_map,
             nrow = 1, ncol = 2)

Moving from a Map to Spatial Dependencies

nyc_nbs = poly2nb(crashes_tracts_geo)
## Warning in poly2nb(crashes_tracts_geo): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(crashes_tracts_geo): neighbour object has 4 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
nyc_coords = st_coordinates(st_centroid(crashes_tracts_geo['geometry']))
plot(st_geometry(crashes_tracts_geo), col='lightgrey')
plot(nyc_nbs, coords=nyc_coords, col='deepskyblue3', add=TRUE, pch=20, cex=0.6)